import numpy as np
import pandas as pd
# Cup population for the first simulation: n_milk_first ones and
# (n_cups - n_milk_first) zeros.
n_cups = 20
n_milk_first = 10
opposite = n_cups - n_milk_first
P = np.concatenate((np.ones(n_milk_first), np.zeros(opposite)), axis=0)
B = 100000  # number of simulated rounds

# Drawing n_milk_first cups from P without replacement and summing them counts
# how many 1s were picked.  That count is hypergeometric with
# (ngood=n_milk_first, nbad=opposite, nsample=n_milk_first), so all B rounds are
# drawn in a single vectorised call instead of B separate np.random.choice
# calls followed by per-row .loc writes.
milk_first_hits = np.random.hypergeometric(
    n_milk_first, opposite, n_milk_first, size=B
)

# The dict maps the column name to its values; the counts are stored as floats,
# the same dtype the zero-initialised column had before.
milk_estimates = pd.DataFrame({"milk_estimate": milk_first_hits.astype(float)})
milk_estimates.head(10)
milk_estimate
0
6.0
1
6.0
2
4.0
3
5.0
4
6.0
5
5.0
6
4.0
7
5.0
8
4.0
9
5.0
# Draw a histogram of the values held in milk_estimates.
milk_estimates.plot.hist()
<AxesSubplot:ylabel='Frequency'>
# Count the simulated rounds whose value is at least observed_val.
observed_val = 8
at_least_observed = milk_estimates["milk_estimate"].ge(observed_val)
# shape is (rows, columns); `a` is the number of rows that passed the filter.
a, b = milk_estimates.loc[at_least_observed].shape
a
# Second simulation: 8 cups, 4 of them coded 1 in the population P.
n_cups = 8
n_milk_first = 4
opposite = n_cups - n_milk_first
P = np.concatenate((np.ones(n_milk_first), np.zeros(opposite)))
observed_val = 4
B = 1000000  # number of simulated rounds

# The number of 1s among n_milk_first cups drawn from P without replacement is
# hypergeometric, so the B rounds are generated in one vectorised call rather
# than a million np.random.choice calls with per-row .loc writes.
sample_hits = np.random.hypergeometric(
    n_milk_first, opposite, n_milk_first, size=B
)
# Stored as floats, matching the zero-initialised float column used before.
estimates = pd.DataFrame({"milk_estimates": sample_hits.astype(float)})

# `a` is the number of rounds at least as large as observed_val;
# the p-value is that count as a fraction of all rounds.
a, b = estimates[estimates["milk_estimates"] >= observed_val].shape
p_value = a / B
print(p_value)
# Monte Carlo estimate of the upper-tail probability: draw B counts from
# Binomial(1000, 0.54) and take the share that reaches observed_val.
B = 100000
observed_val = 620
sample = np.random.binomial(n=1000, p=0.54, size=B)
p_value = (sample >= observed_val).sum() / B
p_value
import scipy.stats as stats
# Exact upper-tail probability P(X >= 620) for X ~ Binomial(1000, 0.54).
binomial_dist = stats.binom(1000, 0.54)
# sf(619) is P(X > 619) = P(X >= 620).  It is used instead of 1 - cdf(619)
# because the tail is tiny and subtracting a cdf value close to 1 from 1
# throws away significant digits.
binomial_dist.sf(619)
# Download the CSV and keep four columns, renaming the two ITEM_* columns
# to height and weight.
url = "https://ilovedata.github.io/teaching/bigdata2/data/physical_test_2018_data.csv"
physical_data = pd.read_csv(url)
twogroup = physical_data.loc[
    :, ["TEST_AGE", "TEST_SEX", "ITEM_F001", "ITEM_F002"]
].rename(
    columns={
        "ITEM_F001": "height",
        "ITEM_F002": "weight",
    }
)
twogroup.head(10)
TEST_AGE
TEST_SEX
height
weight
0
33
M
159.2
57.2
1
48
F
155.8
52.9
2
22
M
175.2
96.2
3
29
M
178.7
79.4
4
31
F
160.1
50.2
5
23
F
157.8
60.1
6
11
M
165.5
60.3
7
24
M
174.9
74.5
8
18
M
181.0
71.3
9
41
F
160.6
72.7
# Randomization: give every row of twogroup the label "A" or "B", each chosen
# independently with equal probability.
a, b = twogroup.shape  # a = number of rows to label
# One vectorised draw of `a` labels replaces a Python-level loop that called
# np.random.choice once per row.
treatment = np.random.choice(["A", "B"], size=a)
twogroup["treatment"] = treatment
twogroup.head(10)
TEST_AGE
TEST_SEX
height
weight
treatment
0
33
M
159.2
57.2
A
1
48
F
155.8
52.9
B
2
22
M
175.2
96.2
B
3
29
M
178.7
79.4
A
4
31
F
160.1
50.2
B
5
23
F
157.8
60.1
B
6
11
M
165.5
60.3
A
7
24
M
174.9
74.5
B
8
18
M
181.0
71.3
A
9
41
F
160.6
72.7
B
# URL of the CSV that is read into three_drug_wide.
filename = "https://ilovedata.github.io/teaching/bigdata2/data/drug.csv"
three_drug_wide = pd.read_csv(filename)
# Bare expression: shows the loaded frame when run as a notebook cell.
three_drug_wide
Placebo
Old
New
0
31
23
23
1
28
17
17
2
34
29
11
3
36
23
11
4
33
17
9
5
27
17
16
6
39
22
16
7
25
17
14
8
23
23
17
9
29
25
18
10
36
17
10
11
36
24
7
12
32
19
14
13
30
18
24
14
33
5
18
15
39
26
17
16
30
33
14
17
32
16
14
18
30
20
16
19
28
27
11
# Reshape wide -> long: the Placebo/Old/New columns are stacked into a
# "treatment" column (the former column name) and a "Value" column (the cell).
three_drug_long = three_drug_wide.melt(
    value_vars=["Placebo", "Old", "New"],
    var_name="treatment",
    value_name="Value",
)
신약그룹이 기존약 그룹보다 효과가 있는가?
귀무가설 : 효과없다.
\[H_0 : \mu_{old} - \mu_{new} = 0 \]
데이터 관측 & 검정통계량 설정
\[T = \bar x_{old} - \bar x_{new} \]
def diff_mean(df, treatment):
    """Return the difference in mean ``Value`` between two treatment groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a ``treatment`` column holding group labels and a numeric
        ``Value`` column.  Any other columns are ignored.
    treatment : sequence
        Two group labels.  The mean of ``treatment[1]`` is subtracted from the
        mean of ``treatment[0]``.

    Returns
    -------
    float
        ``mean(Value | treatment[0]) - mean(Value | treatment[1])``.

    Raises
    ------
    KeyError
        If either label does not occur in ``df["treatment"]``.

    Notes
    -----
    The per-group means are printed before the difference is returned.
    """
    first, second = treatment[0], treatment[1]
    twodrug = df.loc[df["treatment"].isin([first, second])]
    # Average only "Value": taking the mean of every column fails as soon as
    # the frame carries an extra non-numeric column.
    group_mean = twodrug.groupby("treatment")[["Value"]].mean().reset_index()
    print(group_mean)
    # Look the two means up by label instead of converting one-row Series
    # with float(), which pandas has deprecated.
    mean_by_group = group_mean.set_index("treatment")["Value"]
    test_stat = float(mean_by_group[first]) - float(mean_by_group[second])
    return test_stat
# Observed statistic: mean "Value" of the "Old" group minus that of "New".
diff_mean(three_drug_long,["Old" ,"New" ])
treatment Value
0 New 14.85
1 Old 20.90
귀무가설 하에서의 분포 설정
# One random relabelling of the Old/New rows: the Value column stays put and
# the treatment labels are shuffled among the rows.
cond = (three_drug_long.treatment == "Old") | (three_drug_long.treatment == "New")
twodrug = three_drug_long[cond].reset_index(drop=True)
# sample(frac=1.0, replace=False) returns the labels in shuffled order.
# Converting to a plain array drops the index; assigning the shuffled Series
# directly would re-align on the index and undo the shuffle.
random_permuted_treat = twodrug.treatment.sample(frac=1.0, replace=False).to_numpy()
# Work on a copy so the new column is not also added to twodrug
# (plain assignment would only create a second name for the same frame).
two_drug_permuted = twodrug.copy()
two_drug_permuted["permuted_treat"] = random_permuted_treat
two_drug_permuted
treatment
Value
permuted_treat
0
Old
23
Old
1
Old
17
New
2
Old
29
Old
3
Old
23
New
4
Old
17
New
5
Old
17
Old
6
Old
22
Old
7
Old
17
Old
8
Old
23
Old
9
Old
25
Old
10
Old
17
New
11
Old
24
Old
12
Old
19
Old
13
Old
18
New
14
Old
5
Old
15
Old
26
New
16
Old
33
New
17
Old
16
New
18
Old
20
New
19
Old
27
New
20
New
23
New
21
New
17
New
22
New
11
New
23
New
11
Old
24
New
9
Old
25
New
16
New
26
New
16
Old
27
New
14
Old
28
New
17
Old
29
New
18
Old
30
New
10
Old
31
New
7
New
32
New
14
New
33
New
24
Old
34
New
18
Old
35
New
17
New
36
New
14
New
37
New
14
New
38
New
16
New
39
New
11
Old